library(dplyr) 
library(ggplot2)
library(tidyverse)
library(zoo)
library(knitr)
library(readxl)

url = 'https://raw.githubusercontent.com/nytimes/covid-19-data/master/us-counties.csv'
read_csv(url)
## # A tibble: 515,062 x 6
##    date       county      state      fips  cases deaths
##    <date>     <chr>       <chr>      <chr> <dbl>  <dbl>
##  1 2020-01-21 Snohomish   Washington 53061     1      0
##  2 2020-01-22 Snohomish   Washington 53061     1      0
##  3 2020-01-23 Snohomish   Washington 53061     1      0
##  4 2020-01-24 Cook        Illinois   17031     1      0
##  5 2020-01-24 Snohomish   Washington 53061     1      0
##  6 2020-01-25 Orange      California 06059     1      0
##  7 2020-01-25 Cook        Illinois   17031     1      0
##  8 2020-01-25 Snohomish   Washington 53061     1      0
##  9 2020-01-26 Maricopa    Arizona    04013     1      0
## 10 2020-01-26 Los Angeles California 06037     1      0
## # … with 515,052 more rows
source = "/Users/xingxin/Github/geog176a-lab2/data/"


covid<-read_csv(url)
head(covid)
## # A tibble: 6 x 6
##   date       county    state      fips  cases deaths
##   <date>     <chr>     <chr>      <chr> <dbl>  <dbl>
## 1 2020-01-21 Snohomish Washington 53061     1      0
## 2 2020-01-22 Snohomish Washington 53061     1      0
## 3 2020-01-23 Snohomish Washington 53061     1      0
## 4 2020-01-24 Cook      Illinois   17031     1      0
## 5 2020-01-24 Snohomish Washington 53061     1      0
## 6 2020-01-25 Orange    California 06059     1      0

Question 1

#  California, the daily new cases
dat = covid %>% 
  filter(state == "California") %>% 
  group_by(county) %>% 
  mutate(newCases = c(NA, diff(cases))) %>% 
  ungroup() %>% 
  filter(date == max(date)) 
#  the 5 counties with the most cumulative cases
most_cum_cases = dat %>% 
  slice_max(cases, n = 5) %>% 
  select(county, cases)

knitr::kable(most_cum_cases, 
             caption = "Most Cumulative Cases California Counties",
             col.names = c("County", "cumulative Cases"))
Most Cumulative Cases California Counties
County cumulative Cases
Los Angeles 249241
Riverside 54572
Orange 51016
San Bernardino 49691
San Diego 41140
# the 5 counties with the most NEW cases
most_new_cases = dat %>% 
  slice_max(newCases, n = 5) %>% 
  select(county, newCases)

knitr::kable(most_new_cases, 
             caption = "Most New Cases California Counties",
             col.names = c("County", "New Cases"))
Most New Cases California Counties
County New Cases
Riverside 585
San Diego 425
Los Angeles 420
Stanislaus 194
Orange 152
# import population data
PopulationEstimates <- read_excel(paste(source,"PopulationEstimates.xls",sep = ""), 
                                  skip = 2)
population_2019 = PopulationEstimates %>% 
  select(FIPStxt,POP_ESTIMATE_2019) 

# join the population data to the California COVID data.  
dat2 <-inner_join(dat, population_2019, by = c("fips" = "FIPStxt"))
#  the 5 counties with the most cumulative cases per capita
most_cum_cases_pc = dat2 %>%
  mutate(Cases_pc = cases/POP_ESTIMATE_2019) %>%
  slice_max(Cases_pc, n = 5) %>% 
  select(county, Cases_pc)

knitr::kable(most_cum_cases_pc, 
             caption = "Most Cumulative Cases California Counties Per Capita",
             col.names = c("County", "cumulative Cases Per Capita"))
Most Cumulative Cases California Counties Per Capita
County cumulative Cases Per Capita
Imperial 0.0610104
Kings 0.0449719
Kern 0.0335502
Tulare 0.0317807
Merced 0.0303047
# the 5 counties with the most NEW cases per capita
most_new_cases_pc = dat2 %>% 
  mutate(newCases_pc = newCases/POP_ESTIMATE_2019) %>%
  slice_max(newCases_pc, n = 5) %>% 
  select(county, newCases_pc)

knitr::kable(most_new_cases_pc, 
             caption = "Most New Cases California Counties Per Capit",
             col.names = c("County", "New Cases Per Capit"))
Most New Cases California Counties Per Capit
County New Cases Per Capit
Sutter 0.0006291
Glenn 0.0004931
Yuba 0.0004703
Madera 0.0004576
Stanislaus 0.0003523
# 14-days 
per_cap = covid %>% 
  filter(state == "California") %>% 
  group_by(county) %>% 
  mutate(newCases = c(NA, diff(cases))) %>% 
  ungroup() %>%
  filter(date >= max(date)-13)  %>%
  inner_join(population_2019, by = c("fips" = "FIPStxt"))

#Calculate Safe counties
dat_14 = per_cap %>%
  group_by(county,POP_ESTIMATE_2019) %>%
  summarize(tot_newCases = sum(newCases, na.rm = TRUE),tot_cumCases = sum(cases, na.rm = TRUE)) %>%  
  ungroup() %>%
  mutate(tot_safe_county = tot_newCases/(POP_ESTIMATE_2019/100000)) %>% 
  filter(tot_safe_county < 100)

Results of California

  1. the total number of cases: 7.46113^{5}

  2. the total number of new cases: 3424

  3. the total number of the total number of safe counties : 17

# Question 2

# facet plot of the daily new cases and the 7-day rolling mean
# select four states
four_states <- covid %>%
  filter(state %in% c("New York","California", "Louisiana", "Florida")) %>%
  group_by(state, date) %>%
  summarize(cases = sum(cases)) %>% 
  ungroup() %>% 
  group_by(state) %>% 
  mutate(newCases = cases - lag(cases),
         roll7 = rollmean(newCases, 7, fill = NA, align = 'right')) %>%
  ungroup() %>%
  filter(newCases > 0)

  ggplot(data = four_states,aes(x = date)) +
  geom_col(aes(y = newCases), col = NA, fill = "#F5B8B5") +  
  geom_line(aes(y = roll7), col = "darkred", size = 1) +  
  labs(x = "Date", y = "Daily New Cases",
       title = "New Reported Cases By Day ") +
  theme(plot.background = element_rect(fill = "white"),  
        panel.background = element_rect(fill = "white"),  
        plot.title = element_text(size = 14, face = 'bold')) +  
  facet_grid(~state, scales = "free_y")

# import population data
PopulationEstimates <- read_excel(paste(source,"PopulationEstimates.xls",sep = ""), 
                                  skip = 2)
population_2019 = PopulationEstimates %>% 
  select(Area_Name,POP_ESTIMATE_2019) 
# facet plot of the daily new cases and the 7-day rolling mean
# select four states per capita
four_states_pc <- population_2019 %>%
  right_join(four_states, by = c("Area_Name" = "state")) %>%
  mutate(newCases_pc = (newCases/POP_ESTIMATE_2019), 
         roll7 = rollmean(newCases_pc, 7, fill = NA, align = 'right')) %>%
  ungroup() %>%
  filter(newCases_pc > 0)

  ggplot(data = four_states_pc,aes(x = date)) +
  geom_col(aes(y = newCases_pc), col = NA, fill = "skyblue") +  
  geom_line(aes(y = roll7), col = "darkblue", size = 1) +  
  labs(x = "Date", y = "Daily New Cases Per Capita",
       title = "New Reported Cases Per Capita By Day ") +
  theme(plot.background = element_rect(fill = "white"),  
        panel.background = element_rect(fill = "white"),  
        plot.title = element_text(size = 14, face = 'bold')) +  
  facet_grid(~Area_Name, scales = "free_y") 

The absolute value of new cases in Louisiana is much lower than the other three states. However, the new cases per capita in Louisiana is close to the other states. The possible reason is that Louisiana has a much smaller populaition compared with other states.